library(tidyverse)
library(readr)
library(modelr)
day <- read_csv("./day.csv")
temp_orig, feel_temp_orig, hum_orig, windspeed_orig.day <- (day %>%
mutate(temp_orig = temp * 39,
feel_temp_orig = atemp * 50,
hum_orig = hum * 100,
windspeed_org = windspeed * 67))
head(day)
## # A tibble: 6 × 20
## instant dteday season yr mnth holiday weekday workingday weathersit
## <dbl> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2011-01-01 1 0 1 0 6 0 2
## 2 2 2011-01-02 1 0 1 0 0 0 2
## 3 3 2011-01-03 1 0 1 0 1 1 1
## 4 4 2011-01-04 1 0 1 0 2 1 1
## 5 5 2011-01-05 1 0 1 0 3 1 1
## 6 6 2011-01-06 1 0 1 0 4 1 1
## # … with 11 more variables: temp <dbl>, atemp <dbl>, hum <dbl>,
## # windspeed <dbl>, casual <dbl>, registered <dbl>, cnt <dbl>,
## # temp_orig <dbl>, feel_temp_orig <dbl>, hum_orig <dbl>, windspeed_org <dbl>
weathersit variable to be a factor with the values “clear”, “mist”, “light precip”, and “heavy precip”, based on the variable definitions. This factor should have the ordering “clear”, “mist”, “light precip”, “heavy precip”.day$weathersit <- factor(day$weathersit,
levels = c(1, 2, 3, 4),
labels = c('clear', 'mist', 'light precip', 'heavy precip'))
head(day)
## # A tibble: 6 × 20
## instant dteday season yr mnth holiday weekday workingday weathersit
## <dbl> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 1 2011-01-01 1 0 1 0 6 0 mist
## 2 2 2011-01-02 1 0 1 0 0 0 mist
## 3 3 2011-01-03 1 0 1 0 1 1 clear
## 4 4 2011-01-04 1 0 1 0 2 1 clear
## 5 5 2011-01-05 1 0 1 0 3 1 clear
## 6 6 2011-01-06 1 0 1 0 4 1 clear
## # … with 11 more variables: temp <dbl>, atemp <dbl>, hum <dbl>,
## # windspeed <dbl>, casual <dbl>, registered <dbl>, cnt <dbl>,
## # temp_orig <dbl>, feel_temp_orig <dbl>, hum_orig <dbl>, windspeed_org <dbl>
day$workingday <- factor(day$workingday,
levels = c(1, 0),
labels = c('work day', 'weekend/holiday'))
head(day)
## # A tibble: 6 × 20
## instant dteday season yr mnth holiday weekday workingday weathersit
## <dbl> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct>
## 1 1 2011-01-01 1 0 1 0 6 weekend/holi… mist
## 2 2 2011-01-02 1 0 1 0 0 weekend/holi… mist
## 3 3 2011-01-03 1 0 1 0 1 work day clear
## 4 4 2011-01-04 1 0 1 0 2 work day clear
## 5 5 2011-01-05 1 0 1 0 3 work day clear
## 6 6 2011-01-06 1 0 1 0 4 work day clear
## # … with 11 more variables: temp <dbl>, atemp <dbl>, hum <dbl>,
## # windspeed <dbl>, casual <dbl>, registered <dbl>, cnt <dbl>,
## # temp_orig <dbl>, feel_temp_orig <dbl>, hum_orig <dbl>, windspeed_org <dbl>
day$yr <- factor(day$yr,
levels = 0:1,
labels = c('2011', '2012'))
head(day)
## # A tibble: 6 × 20
## instant dteday season yr mnth holiday weekday workingday weathersit
## <dbl> <date> <dbl> <fct> <dbl> <dbl> <dbl> <fct> <fct>
## 1 1 2011-01-01 1 2011 1 0 6 weekend/holi… mist
## 2 2 2011-01-02 1 2011 1 0 0 weekend/holi… mist
## 3 3 2011-01-03 1 2011 1 0 1 work day clear
## 4 4 2011-01-04 1 2011 1 0 2 work day clear
## 5 5 2011-01-05 1 2011 1 0 3 work day clear
## 6 6 2011-01-06 1 2011 1 0 4 work day clear
## # … with 11 more variables: temp <dbl>, atemp <dbl>, hum <dbl>,
## # windspeed <dbl>, casual <dbl>, registered <dbl>, cnt <dbl>,
## # temp_orig <dbl>, feel_temp_orig <dbl>, hum_orig <dbl>, windspeed_org <dbl>
dteday to be recognized as dates by R.# dteday is already recognized as a date.
head(day)
## # A tibble: 6 × 20
## instant dteday season yr mnth holiday weekday workingday weathersit
## <dbl> <date> <dbl> <fct> <dbl> <dbl> <dbl> <fct> <fct>
## 1 1 2011-01-01 1 2011 1 0 6 weekend/holi… mist
## 2 2 2011-01-02 1 2011 1 0 0 weekend/holi… mist
## 3 3 2011-01-03 1 2011 1 0 1 work day clear
## 4 4 2011-01-04 1 2011 1 0 2 work day clear
## 5 5 2011-01-05 1 2011 1 0 3 work day clear
## 6 6 2011-01-06 1 2011 1 0 4 work day clear
## # … with 11 more variables: temp <dbl>, atemp <dbl>, hum <dbl>,
## # windspeed <dbl>, casual <dbl>, registered <dbl>, cnt <dbl>,
## # temp_orig <dbl>, feel_temp_orig <dbl>, hum_orig <dbl>, windspeed_org <dbl>
dteday to create a new variable, days, that is the number of days since January 1, 2011. Note that you can subtract two dates. Keep in mind that the value stored in days should be stored as a number.day <- (day %>%
mutate(days = as.numeric(day$dteday - as.Date("11-01-01", "%y-%m-%d"))))
head(day)
## # A tibble: 6 × 21
## instant dteday season yr mnth holiday weekday workingday weathersit
## <dbl> <date> <dbl> <fct> <dbl> <dbl> <dbl> <fct> <fct>
## 1 1 2011-01-01 1 2011 1 0 6 weekend/holi… mist
## 2 2 2011-01-02 1 2011 1 0 0 weekend/holi… mist
## 3 3 2011-01-03 1 2011 1 0 1 work day clear
## 4 4 2011-01-04 1 2011 1 0 2 work day clear
## 5 5 2011-01-05 1 2011 1 0 3 work day clear
## 6 6 2011-01-06 1 2011 1 0 4 work day clear
## # … with 12 more variables: temp <dbl>, atemp <dbl>, hum <dbl>,
## # windspeed <dbl>, casual <dbl>, registered <dbl>, cnt <dbl>,
## # temp_orig <dbl>, feel_temp_orig <dbl>, hum_orig <dbl>, windspeed_org <dbl>,
## # days <dbl>
Now you’ll investigate different models predicting relationships between the number of rentals (cnt) and the temperature (temp_orig).
ggplot(day) +
geom_point(aes(x = temp_orig, y = cnt, col = temp)) +
labs(x = 'Bike rentals',
y = 'Temperature in Celcius',
title = 'Bike Rentals in DC in 2011 and 2012',
subtitle = 'Warmer weather leads to more rentals')
temp_orig and cnt variables from your modified bikes data frame.bt <- data.frame(day['temp_orig'], day['cnt'])
head(bt)
## temp_orig cnt
## 1 13.422513 985
## 2 14.175642 801
## 3 7.658196 1349
## 4 7.800000 1562
## 5 8.851323 1600
## 6 7.969572 1606
cnt, is the dependent (response) variable and the original temperature in Celsius, temp_orig, is the independent (predictor) variable. Save these with different names.# Linear
line_day <- lm(cnt ~ temp_orig, data = bt)
line_coe <- coefficients(line_day)
# Quadratic
bt2 <- (bt %>%
mutate(tsqrt = temp_orig ** 2))
quad_day <- model_matrix(cnt ~ temp_orig + tsqrt, data = bt2)
pred <- lm(cnt ~ temp_orig + tsqrt, data = bt2)
quad_day <- quad_day %>%
add_predictions(pred)
grid data frame with values of temp_orig (cut into 20 evenly spaced values) and separately add the predictions from both models to this data frame. NOTE: Do not use gather_predictions here, as Dr. McNelis wants the model predictions to be in two different columns. The predictions from the linear model should be labeled “linpred” and those from the quadratic model labelled as “quadpred”. Note, the add_predictions function allows you to specify the name of variable containing the predictions.temp_grid <- (bt %>%
data_grid(temp_orig = seq_range(temp_orig, 20)))
temp_grid <- (temp_grid %>%
add_predictions(line_day, var = 'linpred'))
temp_grid <- (temp_grid %>%
mutate(tsqrt = temp_orig ^ 2) %>%
add_predictions(pred, var = 'quadpred') %>%
select(-tsqrt))
temp_grid
## # A tibble: 20 × 3
## temp_orig linpred quadpred
## <dbl> <dbl> <dbl>
## 1 2.31 1607. -689.
## 2 3.95 1888. 113.
## 3 5.60 2168. 862.
## 4 7.25 2449. 1556.
## 5 8.90 2729. 2197.
## 6 10.5 3010. 2785.
## 7 12.2 3290. 3318.
## 8 13.8 3571. 3798.
## 9 15.5 3851. 4224.
## 10 17.1 4132. 4597.
## 11 18.8 4412. 4915.
## 12 20.4 4693. 5180.
## 13 22.1 4973. 5391.
## 14 23.7 5254. 5549.
## 15 25.4 5534. 5653.
## 16 27.0 5815. 5703.
## 17 28.7 6095. 5699.
## 18 30.3 6376. 5642.
## 19 32.0 6656. 5531.
## 20 33.6 6937. 5366.
temp_orig versus cnt and add curves for your linear and quadratic models from your grid data frame. Have a nice title, subtitle, and axes labels on your graph.ggplot(day, aes(x = temp_orig, y = cnt, col = temp)) +
geom_point(alpha = .5) +
geom_line(data = temp_grid, aes(x = temp_orig, y = quadpred), color = "red") +
geom_line(data = temp_grid, aes(x = temp_orig, y = linpred), color = 'blue') +
labs(x = 'Temperature in Celcius',
y = 'Bike rentals',
title = "Comparing Models",
subtitle = "Blue is linear and red is quadratic.")
bt <- (bt %>%
add_residuals(line_day, var = 'line') %>%
mutate(tsqrt = temp_orig ^ 2) %>%
add_residuals(pred, var = 'quad') %>%
select(-tsqrt))
bt_long <- (bt %>%
pivot_longer(cols = c('line', 'quad'),
names_to = 'model',
values_to = 'resids'))
bt_long
## # A tibble: 1,462 × 4
## temp_orig cnt model resids
## <dbl> <dbl> <chr> <dbl>
## 1 13.4 985 line -2515.
## 2 13.4 985 quad -2697.
## 3 14.2 801 line -2827.
## 4 14.2 801 quad -3089.
## 5 7.66 1349 line -1170.
## 6 7.66 1349 quad -372.
## 7 7.8 1562 line -981.
## 8 7.8 1562 quad -215.
## 9 8.85 1600 line -1122.
## 10 8.85 1600 quad -581.
## # … with 1,452 more rows
ggplot(bt_long) +
geom_point(aes(x = temp_orig, y = resids, col = model)) +
geom_ref_line(h = 0) +
facet_grid(~model) +
labs(x = 'Temperature in Celcius',
y = 'Residual',
title = "Residuals of the linear and quadratic models",
subtitle = "Blue is linear and red is quadratic.") +
theme(legend.position = 'none')
Overall the quadratic model seems like it represents the data better. The amount of rentals flattens out and decreases a little after 20 degrees, the linear model does not capture that at all. The linear model’s residuals also creates a pretty defined quadratic curve, whereas the quadratic model’s residuals are distributed better.
Now you’ll investigate different models predicting relationships between the number of rentals (cnt) and time, in terms of the number of days since January 1, 2011 (days).
ggplot(day) +
geom_point(aes(x = days, y = cnt, col = temp_orig)) +
labs(x = 'Days since January 1, 2011',
y = 'Bike rentrals',
title = 'Bike Rentals in CD, 2011 and 2012',
subtitle = 'Rentals trends over time')
bd (for bike days) that contains the days and cnt variables from your modified bikes data frame.bd <- data.frame(day['days'], day['cnt'])
fit_order_n that takes an input of an integer n and returns a scatter plot of our data (like that in problem 1) with our nth order polynomial plotted along with the data. Make sure that the graph includes a TITLE that indicates the order of the polynomial (hint: check out help on the paste function).fit_order_n <- function(n) {
fit <- lm(cnt ~ poly(days, degree = n), data = bd)
p_grid <- data_grid(bd, days)
p_grid <- (bd %>%
gather_predictions(fit))
ggplot(p_grid) +
geom_point(aes(x = days, y = cnt), alpha = 0.2) +
geom_line(aes(x = days, y = pred), col = 'red') +
labs(x = 'Days since January 1, 2011',
y = 'Bike rentrals',
title = 'Estimating Bike Rentals in DC',
subtitle = paste('using ', n,
ifelse(n == 1, 'st',
ifelse(n == 2, 'nd',
ifelse(n == 3, 'rd', 'th'))),
' order polynomial', sep = ''))
}
# I couldn't get a for loop to work
fit_order_n(1)
fit_order_n(2)
fit_order_n(3)
fit_order_n(4)
fit_order_n(5)
fit_order_n(6)
fit_order_n(7)
fit_order_n(8)
fit_order_n(9)
fit_order_n(10)
Using a 4th order polynomial seems to be the minimum that fits our data, but using a 6th order polynomial seems to be a substainal improvement over 4th and 5th, and then more or less equal to higher order polynomials.
Rather than thinking that a polynomial may be a good fit, you may think that a sinusoidal function might be a good fit, as there is a natural period in 365.25 days in a year. We’ll use the following to help us determine the best sinusoidal model, with formula y = Asin(Bx + φ) + k where A is the amplitude, \(\frac{2π}{|B|}\) is the period, φ is the phase shift, and k is the midline. We can use the trigonometric identity sin(α + β) = sin(α) cos(β) + cos(α) sin(β) to get that